

***** Clean Raw Gazole Data *****
foreach i of numlist 2007/2018 {
clear 
import delimited "$path\Raw Data\Base_prix`i'_gazole.csv", delim(";") varnames(1)

drop v1
drop if id_pdv=="NA" | date_ma=="NA" | id_carburant=="NA"  | prix=="NA"
destring prix, replace force


gen jour_maj=substr(date_maj,1,10)
gen heure_maj=substr(date_maj,12,8)
drop date_maj

*Generate a variable indicating hours
capture drop ts
egen str ts=concat(jour_maj heure_maj), punct(" ")
capture drop dt
gen double dt = clock(ts, "YMD hms")
format dt %tc


gen annee=substr(jour_maj,1,4)
gen mois=substr(jour_maj,6,2)
gen jour=substr(jour_maj,9,2)
destring annee, replace force
destring mois, replace force
destring jour, replace force

gen date_maj = mdy(mois, jour, annee)
format date_maj %d
drop jour_maj annee mois jour
drop heure_maj
drop ts
rename dt full_date_maj


save "$path\Intermediary Data\Base_prix`i'_gazole_toappend.dta", replace
}


// Append all
clear
use "$path\Intermediary Data\Base_prix2007_gazole_toappend.dta"
append using "$path\Intermediary Data\Base_prix2008_gazole_toappend.dta"
append using "$path\Intermediary Data\Base_prix2009_gazole_toappend.dta"
append using "$path\Intermediary Data\Base_prix2010_gazole_toappend.dta"
append using "$path\Intermediary Data\Base_prix2011_gazole_toappend.dta"
append using "$path\Intermediary Data\Base_prix2012_gazole_toappend.dta"
append using "$path\Intermediary Data\Base_prix2013_gazole_toappend.dta"
append using "$path\Intermediary Data\Base_prix2014_gazole_toappend.dta"
append using "$path\Intermediary Data\Base_prix2015_gazole_toappend.dta"
append using "$path\Intermediary Data\Base_prix2016_gazole_toappend.dta"
append using "$path\Intermediary Data\Base_prix2017_gazole_toappend.dta"
append using "$path\Intermediary Data\Base_prix2018_gazole_toappend.dta"

// Transform tax-included prices into tax-excluded prices
do "$path\Codes\Prepare_IntermediaryData\02_TICPE.do"


capture drop num_obs
capture drop first_obs

sort id_pdv full_date_maj
capture drop num_obs
gen num_obs=_n

// 0. Preliminary: identify cases where same observations on same exact hour

*** First case: same exact price and hour: drop duplicates
duplicates drop id_pdv full_date_maj prix, force

*** Second case: same exact hour but not same price --> keep highest value
duplicates tag id_pdv full_date_maj, gen(tag)
bysort id_pdv full_date_maj: egen prix_max=max(prix)
drop if prix!=prix_max & tag!=0 & tag!=.
capture drop tag
capture drop prix_max
duplicates drop id_pdv full_date_maj, force /* Last duplicates drop for check: nothing should be dropped */



// 1. Suppress "too low" price levels which correspond to "wrong" updates 
xtile xt_prix_ht=prix_ht, nq(200)
drop if xt_prix_ht==200 | xt_prix_ht==1
capture drop xt_prix_ht


** Keep only one observation per price, namely the first one
capture drop same_price_asbefore
bysort id_pdv (num_obs): gen same_price_asbefore=1 if prix==prix[_n-1]
keep if same_price_asbefore==.
drop same_price_asbefore

*** Keep one observation per day, namely the last one 
capture drop same_price_asbefore
capture drop same_day_asafter
bysort id_pdv (num_obs): gen same_day_asafter=1 if date_maj==date_maj[_n+1]
drop if same_day_asafter==1
capture drop same_day_asafter

// 2. Drop outliers in terms of stations

// 2.1 - Drop stations depending on median time interval between two changes
bysort id_pdv (num_obs): gen timeinterval_lastchange=date_maj-date_maj[_n-1]
bysort id_pdv: egen median_timeinterval=median(timeinterval_lastchange)

** Check deciles 
bysort id_pdv (num_obs): gen first_obs=1 if _n==1
summ median_timeinterval if first_obs==1, detail
summ timeinterval_lastchange, detail

drop if median_timeinterval>365 
drop if median_timeinterval>182 
drop if median_timeinterval>90 
drop if median_timeinterval>30 
drop if median_timeinterval>12 

drop median_timeinterval


// 2.2 - Drop stations staying less than 6 months in the sample

bysort id_pdv (num_obs): egen min_day_maj=min(date_maj)
bysort id_pdv (num_obs): egen max_day_maj=max(date_maj)
gen number_days_insample=max_day_maj-min_day_maj

drop if number_days_insample<30 
drop if number_days_insample<60 
drop if number_days_insample<90 
drop if number_days_insample<120 
drop if number_days_insample<160 
drop if number_days_insample<180 

drop min_day_maj
drop max_day_maj 


// 3. If duration between two price changes is greater than 15/30/60 days, no price change is computed and a new identifier is created

* 3.1 Create a new identifier

capture drop number_changes_othertypes
bysort id_pdv (num_obs): gen number_changes_othertypes=num_obs-num_obs[_n-1]-1

capture drop tag_sup30
gen tag_sup30=0
replace tag_sup30=1 if timeinterval_lastchange>30 & timeinterval_lastchange!=. 

capture drop tag_sup15
gen tag_sup15=0
replace tag_sup15=1 if timeinterval_lastchange>15 & timeinterval_lastchange!=. 

capture drop tag_sup60
gen tag_sup60=0
replace tag_sup60=1 if timeinterval_lastchange>60 & timeinterval_lastchange!=. 




**** Generate a new ID equal to ID_PDV + Num_Obs

egen id_pdv_15_temp=concat(id_pdv num_obs) if tag_sup15==1, punct("_")
bysort id_pdv (num_obs): carryforward id_pdv_15_temp, gen(id_pdv_15)
replace id_pdv_15=id_pdv if id_pdv_15==""
capture drop id_pdv_15_temp

egen id_pdv_30_temp=concat(id_pdv num_obs) if tag_sup30==1, punct("_")
bysort id_pdv (num_obs): carryforward id_pdv_30_temp, gen(id_pdv_30)
replace id_pdv_30=id_pdv if id_pdv_30==""
capture drop id_pdv_30_temp

egen id_pdv_60_temp=concat(id_pdv num_obs) if tag_sup60==1, punct("_")
bysort id_pdv (num_obs): carryforward id_pdv_60_temp, gen(id_pdv_60)
replace id_pdv_60=id_pdv if id_pdv_60==""
capture drop id_pdv_60_temp


bysort id_pdv_15 (num_obs): egen min_day_maj15=min(date_maj)
bysort id_pdv_15 (num_obs): egen max_day_maj15=max(date_maj)
gen number_days_insample15=max_day_maj15-min_day_maj15

bysort id_pdv_30 (num_obs): egen min_day_maj30=min(date_maj)
bysort id_pdv_30 (num_obs): egen max_day_maj30=max(date_maj)
gen number_days_insample30=max_day_maj30-min_day_maj30

bysort id_pdv_60 (num_obs): egen min_day_maj60=min(date_maj)
bysort id_pdv_60 (num_obs): egen max_day_maj60=max(date_maj)
gen number_days_insample60=max_day_maj60-min_day_maj60

bysort id_pdv_15 (num_obs): gen first_obs15=1 if _n==1
bysort id_pdv_30 (num_obs): gen first_obs30=1 if _n==1
bysort id_pdv_60 (num_obs): gen first_obs60=1 if _n==1


***** Save data for figure A.1 in Appendix

preserve
keep if first_obs15==1
gen date_tokeep=_n+17166
gen number_open15=.
forvalues l=17167/21549{
gen open15_`l'=0
replace open15_`l'=1 if min_day_maj15<=`l' & max_day_maj15>=`l'
egen number_open15_`l'=sum(open15_`l')
replace number_open15=number_open15_`l' if date_tokeep==`l'
drop number_open15_`l' open15_`l'
display(`l')
}
keep if date_tokeep>=17167 & date_tokeep<=21549
keep date_tokeep number_open15
save "$path\Intermediary Data\Open_Gazole_bydate_15.dta", replace
restore


preserve
keep if first_obs30==1
gen date_tokeep=_n+17166
gen number_open30=.
forvalues l=17167/21549{
gen open30_`l'=0
replace open30_`l'=1 if min_day_maj30<=`l' & max_day_maj30>=`l'
egen number_open30_`l'=sum(open30_`l')
replace number_open30=number_open30_`l' if date_tokeep==`l'
drop number_open30_`l' open30_`l'
display(`l')
}
keep if date_tokeep>=17167 & date_tokeep<=21549
keep date_tokeep number_open30
save "$path\Intermediary Data\Open_Gazole_bydate_30.dta", replace
restore


preserve
keep if first_obs60==1
gen date_tokeep=_n+17166
gen number_open60=.
forvalues l=17167/21549{
gen open60_`l'=0
replace open60_`l'=1 if min_day_maj60<=`l' & max_day_maj60>=`l'
egen number_open60_`l'=sum(open60_`l')
replace number_open60=number_open60_`l' if date_tokeep==`l'
drop number_open60_`l' open60_`l'
display(`l')
}
keep if date_tokeep>=17167 & date_tokeep<=21549
keep date_tokeep number_open60
save "$path\Intermediary Data\Open_Gazole_bydate_60.dta", replace
restore



keep date_maj prix prix_ht id_pdv_30 num_obs

egen id_pdv_30num=group(id_pdv_30)

gen date_maj_num=date_maj
drop date_maj
drop num_obs

xtset id_pdv_30num date_maj_num

tsfill


bysort id_pdv_30num (date_maj_num): carryforward prix, replace
bysort id_pdv_30num (date_maj_num): carryforward prix_ht, replace
bysort id_pdv_30num (date_maj_num): carryforward id_pdv_30, replace


save "$path\Intermediary Data\Prix_Gazole30_Level_DailyFULL.dta", replace

